# Load the healthcare stroke dataset (one row per individual).
stroke_df <- read.csv("./data/healthcare-dataset-stroke-data.csv")
# head(stroke_df)
Distribution of stroke:
# Pie chart of the outcome distribution (stroke vs no stroke).
# Fix: the percent labels were hard-coded as "95.13 %" / "4.87 %"; compute
# them from the data so the figure stays correct if the dataset changes.
# Also avoid referencing the data frame by name inside mutate().
dataplot1 <- stroke_df %>%
  dplyr::count(stroke) %>%
  mutate(ntotal = sum(n), perc = n / ntotal)
# Row 1 is stroke == 0 (no stroke), row 2 is stroke == 1.
pie_labels <- sprintf("%.2f %%", dataplot1$perc * 100)
plot1 <- ggplot(dataplot1,
                aes(x = "", y = perc * 100,
                    fill = as.factor(stroke), group = as.factor(stroke))) +
  theme_bw() +
  geom_bar(width = 1, stat = "identity") +
  theme_void() +
  labs(x = " ", y = " ", fill = " ") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  # Label positions kept from the original layout; text now data-driven.
  geom_text(y = 55, label = pie_labels[1], size = 5) +
  geom_text(aes(label = pie_labels[2]), y = 2.5, x = 1.3, size = 4) +
  coord_polar("y", start = 0) +
  theme(legend.text = element_text(size = 15))
plot1
We could see that only 4.87% of the 5110 individuals in the dataset suffered a stroke.
# Gender: empirical probability of stroke by gender.
# Fix: tidyr::spread() is superseded; use pivot_wider(). The single "Other"
# gender observation is excluded from the plot, as before.
dataplot2 <- stroke_df %>%
  dplyr::count(stroke, gender) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot2) <- c("gender", "neg", "pos")
dataplot2 <- dataplot2 %>% mutate(perc_gender = pos / (pos + neg))
plot2 <- ggplot(dataplot2 %>% filter(gender != "Other"),
                aes(x = gender, y = perc_gender * 100,
                    fill = as.factor(gender), group = as.factor(gender))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Gender", x = "", y = "Probability of stroke (%)") +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
Smoking status:
# Smoking status: empirical probability of stroke per category.
dataplot3_1 <- stroke_df %>%
  dplyr::count(stroke, smoking_status) %>%
  spread(stroke, n)
names(dataplot3_1) <- c("smoking_status", "neg", "pos")
dataplot3_1 <- dataplot3_1 %>% mutate(perc_smoke = pos / (pos + neg))
plot3 <- ggplot(dataplot3_1,
                aes(x = smoking_status, y = perc_smoke * 100,
                    fill = as.factor(smoking_status),
                    group = as.factor(smoking_status))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Smoking status", x = " ", y = "Probability of stroke (%)") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(labels = c("formerly smoked" = "Formerly smoked",
                              "never smoked" = "Never smoked",
                              "smokes" = "Smokes")) +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
People who identified as former smokers have the highest probability of having a stroke (~8%), followed by smokers and then people who never smoked.
# Hypertension (0/1): empirical probability of stroke per group.
dataplot3_1a <- stroke_df %>%
  dplyr::count(stroke, hypertension) %>%
  spread(stroke, n)
names(dataplot3_1a) <- c("hypertension", "neg", "pos")
dataplot3_1a <- dataplot3_1a %>% mutate(perc_hyp = pos / (pos + neg))
plot4 <- ggplot(dataplot3_1a,
                aes(x = as.factor(hypertension), y = perc_hyp * 100,
                    fill = as.factor(hypertension),
                    group = as.factor(hypertension))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Hypertension", x = " ", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(breaks = c("0", "1"),
                   labels = c("0" = "No hypertension", "1" = "Hypertension")) +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# Heart disease (0/1): empirical probability of stroke per group.
dataplot3_1b <- stroke_df %>%
  dplyr::count(stroke, heart_disease) %>%
  spread(stroke, n)
names(dataplot3_1b) <- c("heart_disease", "neg", "pos")
dataplot3_1b <- dataplot3_1b %>% mutate(perc_hd = pos / (pos + neg))
plot5 <- ggplot(dataplot3_1b,
                aes(x = as.factor(heart_disease), y = perc_hd * 100,
                    fill = as.factor(heart_disease),
                    group = as.factor(heart_disease))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Heart disease", x = "", y = "Probability of stroke (%)",
       fill = "Heart disease") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(breaks = c("0", "1"),
                   labels = c("0" = "No HD", "1" = "HD")) +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# Ever married (Yes/No): empirical probability of stroke per group.
dataplot3_1c <- stroke_df %>%
  dplyr::count(stroke, ever_married) %>%
  spread(stroke, n)
names(dataplot3_1c) <- c("ever_married", "neg", "pos")
dataplot3_1c <- dataplot3_1c %>% mutate(perc_em = pos / (pos + neg))
plot6 <- ggplot(dataplot3_1c,
                aes(x = ever_married, y = perc_em * 100,
                    fill = as.factor(ever_married),
                    group = as.factor(ever_married))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Ever married", x = "", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# Work type: empirical probability of stroke per category
# ("Never_worked" is excluded from the plot).
dataplot3_1d <- stroke_df %>%
  dplyr::count(stroke, work_type) %>%
  spread(stroke, n)
names(dataplot3_1d) <- c("work_type", "neg", "pos")
dataplot3_1d <- dataplot3_1d %>% mutate(perc_wt = pos / (pos + neg))
plot7 <- ggplot(dataplot3_1d %>% filter(work_type != "Never_worked"),
                aes(x = work_type, y = perc_wt * 100,
                    fill = as.factor(work_type),
                    group = as.factor(work_type))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Work type", x = " ", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(labels = c("children" = "Children",
                              "Govt_job" = "Gov. Job")) +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# Residence type (Rural/Urban): empirical probability of stroke per group.
dataplot3_1e <- stroke_df %>%
  dplyr::count(stroke, Residence_type) %>%
  spread(stroke, n)
names(dataplot3_1e) <- c("Residence_type", "neg", "pos")
dataplot3_1e <- dataplot3_1e %>% mutate(perc_rt = pos / (pos + neg))
plot8 <- ggplot(dataplot3_1e,
                aes(x = Residence_type, y = perc_rt * 100,
                    fill = as.factor(Residence_type),
                    group = as.factor(Residence_type))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Residence type", x = " ", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none",
        text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
Categorical variables:
# Arrange the categorical-variable plots on a shared, aligned grid.
# Fix: seven plots were aligned but only six were displayed — plot8
# (residence type, allplotslist_1[[7]]) was silently dropped even though the
# discussion below refers to residence type. Include it in the grid.
allplotslist_1 <- align_plots(plot2, plot3, plot4, plot5, plot6, plot7, plot8,
                              align = "hv")
grid_1 <- grid.arrange(allplotslist_1[[1]], allplotslist_1[[2]],
                       allplotslist_1[[3]], allplotslist_1[[4]],
                       allplotslist_1[[5]], allplotslist_1[[6]],
                       allplotslist_1[[7]], nrow = 3)
Continuous variable:
# Age: density of age by outcome (stroke vs no stroke).
plot9 <- stroke_df %>%
  ggplot() +
  geom_density(aes(x = age, group = as.factor(stroke),
                   fill = as.factor(stroke)),
               size = 1, alpha = 0.5, adjust = 2) +
  theme_bw() +
  ylab("Density") +
  labs(fill = " ", x = "Age") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  theme(text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# BMI: density of body-mass index by outcome (stroke vs no stroke).
plot10 <- stroke_df %>%
  ggplot() +
  geom_density(aes(x = bmi, group = as.factor(stroke),
                   fill = as.factor(stroke)),
               size = 1, alpha = 0.5, adjust = 2) +
  theme_bw() +
  ylab("Density") +
  labs(fill = " ", x = "BMI") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  theme(text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# Average glucose level: density by outcome (stroke vs no stroke).
plot11 <- stroke_df %>%
  ggplot() +
  geom_density(aes(x = avg_glucose_level, group = as.factor(stroke),
                   fill = as.factor(stroke)),
               size = 1, alpha = 0.5, adjust = 2) +
  theme_bw() +
  ylab("Density") +
  labs(fill = " ", x = "Avg. glucose level") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  theme(text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))
# Arrange the three continuous-variable density plots side by side.
allplotslist_2 <- align_plots(plot9, plot10, plot11, align = "hv")
grid_3 <- grid.arrange(allplotslist_2[[1]], allplotslist_2[[2]],
                       allplotslist_2[[3]], ncol = 3)
Comment: From these plots we can see that:
Former smokers are more prone to suffering a stroke than current smokers. This could be because former smokers quit after acquiring health conditions that raised their risk of having a stroke.
Self-employed are under higher risk of suffering a stroke than private and government jobs. Maybe due to higher stress and lack of insurance that are results of being self-employed?
Urban residents, males, and people with hypertension or heart disease are more prone to suffering a stroke. In addition, people who have been married are also more likely to suffer a stroke than people who have never been married.
Age seems to be an important factor, with higher age comes higher chance of having a stroke. There are far more people who developed a stroke that have high glucose level than people with low glucose level.
# Encode predictors numerically for modeling.
# NOTE: factor() %>% as.numeric() maps categories to their 1-based level
# codes, so binary 0/1 variables (hypertension, heart_disease) become 1/2
# here, not 0/1.
stroke_df$stroke = as.factor(stroke_df$stroke)
# gender: Female/Male/Other -> 1/2/3 (alphabetical level order).
stroke_df$gender = factor(stroke_df$gender) %>% as.numeric()
stroke_df$ever_married = factor(stroke_df$ever_married) %>% as.numeric()
stroke_df$work_type = factor(stroke_df$work_type) %>% as.numeric()
stroke_df$Residence_type = factor(stroke_df$Residence_type) %>% as.numeric()
stroke_df$smoking_status = factor(stroke_df$smoking_status) %>% as.numeric()
stroke_df$heart_disease = factor(stroke_df$heart_disease) %>% as.numeric()
stroke_df$hypertension = as.numeric(factor(stroke_df$hypertension))
# NOTE(review): work_type was already converted above; this second
# conversion is a no-op only when all five numeric codes are present —
# consider removing it.
stroke_df$work_type = as.factor(stroke_df$work_type) %>% as.numeric()
# bmi is read as character (contains "N/A" entries); coercion turns those
# into NA and triggers the warning below — this is expected.
stroke_df$bmi = as.numeric(stroke_df$bmi)
## Warning: NAs introduced by coercion
# Drop the id column, relabel the outcome as No/Yes, and remove the single
# "Other" gender observation (coded 3 after the factor encoding above).
stroke_df = stroke_df[, -1] %>%
mutate(stroke = recode(stroke,
`0` = "No",
`1` = "Yes"),
stroke = factor(stroke)) %>%
filter(gender < 3)
summary(stroke_df)
## gender age hypertension heart_disease
## Min. :1.000 Min. : 0.08 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :45.00 Median :1.000 Median :1.000
## Mean :1.414 Mean :43.23 Mean :1.097 Mean :1.054
## 3rd Qu.:2.000 3rd Qu.:61.00 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :2.000 Max. :82.00 Max. :2.000 Max. :2.000
##
## ever_married work_type Residence_type avg_glucose_level
## Min. :1.000 Min. :1.000 Min. :1.000 Min. : 55.12
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 77.24
## Median :2.000 Median :4.000 Median :2.000 Median : 91.88
## Mean :1.656 Mean :3.495 Mean :1.508 Mean :106.14
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:114.09
## Max. :2.000 Max. :5.000 Max. :2.000 Max. :271.74
##
## bmi smoking_status stroke
## Min. :10.30 Min. :1.000 No :4860
## 1st Qu.:23.50 1st Qu.:2.000 Yes: 249
## Median :28.10 Median :2.000
## Mean :28.89 Mean :2.586
## 3rd Qu.:33.10 3rd Qu.:4.000
## Max. :97.60 Max. :4.000
## NA's :201
# Visualize missingness (bmi carries the NAs) and preview the encoded data.
vis_miss(stroke_df)
head(stroke_df)
## gender age hypertension heart_disease ever_married work_type Residence_type
## 1 2 67 1 2 2 4 2
## 2 1 61 1 1 2 5 1
## 3 2 80 1 2 2 4 1
## 4 1 49 1 1 2 4 2
## 5 1 79 2 1 2 5 1
## 6 2 81 1 1 2 4 2
## avg_glucose_level bmi smoking_status stroke
## 1 228.69 36.6 1 Yes
## 2 202.21 NA 2 Yes
## 3 105.92 32.5 2 Yes
## 4 171.23 34.4 3 Yes
## 5 174.12 24.0 2 Yes
## 6 186.21 29.0 1 Yes
# 70/30 stratified train/test split, then kNN imputation (k = 3) fitted on
# the training set only and applied to both splits — fitting on train.data
# alone avoids leaking test-set information into the imputation.
set.seed(123)
trRow <- createDataPartition(y = stroke_df$stroke, p = 0.7, list = FALSE)
train.data <- stroke_df[trRow, ]
test.data <- stroke_df[-trRow, ]
# Fix: the original line was garbled ("preProcess()knnImp = preProcess(...)"),
# a syntax error; the intended single call fits the imputation model.
# Note that knnImpute also centers and scales all numeric predictors.
knnImp <- preProcess(train.data, method = "knnImpute", k = 3)
train.data <- predict(knnImp, train.data)
vis_miss(train.data)
test.data <- predict(knnImp, test.data)
vis_miss(test.data)
We try the following models to see which algorithm fits best: since our outcome is binary, we will proceed with whichever classifier performs best. We use accuracy and ROC/AUC as our evaluation metrics.
# Baseline logistic regression on all predictors.
# Fix: the original fit used the raw stroke_df (subset = trRow) but then
# predicted on test.data, which knnImpute has centered/scaled — train and
# test predictors were on different scales. Fit on the preprocessed
# train.data so both sets share the same scale (and imputed values).
glm.fit <- glm(stroke ~ .,
               data = train.data,
               family = binomial(link = "logit"))
summary(glm.fit)
# Predicted probability of stroke on the held-out set.
test.pred.prob <- predict(glm.fit, newdata = test.data,
                          type = "response")
# Hard classification at a 0.5 probability threshold.
test.pred <- rep("No", length(test.pred.prob))
test.pred[test.pred.prob > 0.5] <- "Yes"
head(test.pred)
# 2x2 confusion matrix against the true test labels
# (test.data$stroke is the same vector as stroke_df$stroke[-trRow]).
confusionMatrix(data = as.factor(test.pred),
                reference = test.data$stroke,
                positive = "Yes")
# Test-set ROC curve with AUC; overlay a smoothed curve.
roc.glm <- roc(test.data$stroke, test.pred.prob)
plot(roc.glm, legacy.axes = TRUE, print.auc = TRUE)
plot(smooth(roc.glm), col = 4, add = TRUE)
# Accuracy = 0.9517 = proportion of corrected class = (TN + TP) / total , 95% CI = (0.9397, 0.9619)
# NIR = 0.9517 = the proportion of the majority class (here, the negative class). With highly imbalanced classes the NIR is very high: a trivial classifier that assigns every observation to the negative class already achieves this accuracy. Accuracy is therefore not a meaningful metric for highly imbalanced data.
# p-value = 0.5309. Null hypothesis: Accuracy = NIR. Since p-value[Acc > NIR] = 0.5309 >> 0.05, we fail to reject the null — the model's accuracy is not better than the no-information rate.
# Kappa measures agreement between observed and predicted labels while accounting for agreement by chance. Here Kappa = 0 => no agreement beyond chance (the model never predicts the positive class).
# Using caret: cross-validated logistic regression optimizing ROC.
# twoClassSummary + classProbs = TRUE makes ROC/Sens/Spec available.
ctrl <- trainControl(method = "cv",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)
set.seed(1)
# Columns 1:10 are the predictors; column 11 is the outcome (stroke).
model.glm <- train(x = train.data[, 1:10],
                   y = train.data$stroke,
                   method = "glm",
                   metric = "ROC",
                   trControl = ctrl)
# Hard classification at a 0.5 probability threshold.
glm.pred <- predict(model.glm, newdata = test.data, type = "prob")
glm.prob <- ifelse(glm.pred$Yes > 0.5, "Yes", "No")
confusionMatrix(data = as.factor(glm.prob),
                reference = test.data$stroke,
                positive = "Yes")
## Warning in confusionMatrix.default(data = as.factor(glm.prob), reference =
## test.data$stroke, : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1458 74
## Yes 0 0
##
## Accuracy : 0.9517
## 95% CI : (0.9397, 0.9619)
## No Information Rate : 0.9517
## P-Value [Acc > NIR] : 0.5309
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.9517
## Prevalence : 0.0483
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
# Test-set ROC/AUC for the caret glm (column 2 = P(stroke = "Yes")).
roc.glm <- roc(test.data$stroke, glm.pred[, 2])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
auc.glm <- roc.glm$auc[1]
auc.glm
## [1] 0.8423516
plot(roc.glm, legacy.axes = TRUE, print.auc = TRUE)
plot(smooth(roc.glm), col = 4, add = TRUE)
## 161-nearest neighbor model
## Training set outcome distribution:
##
## No Yes
## 3402 175
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1458 74
## Yes 0 0
##
## Accuracy : 0.9517
## 95% CI : (0.9397, 0.9619)
## No Information Rate : 0.9517
## P-Value [Acc > NIR] : 0.5309
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.9517
## Prevalence : 0.0483
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
## [1] 0.8195927
# GAM via caret (smooth terms for the continuous predictors).
# Fix: the original passed x = train.data[, c(1:11)], which includes the
# outcome column (stroke, column 11) among the features; restrict x to the
# 10 predictor columns, matching the glm model above.
set.seed(1)
model.gam <- train(x = train.data[, 1:10],
                   y = train.data$stroke,
                   method = "gam",
                   metric = "ROC",
                   trControl = ctrl)
model.gam$finalModel
##
## Family: binomial
## Link function: logit
##
## Formula:
## .outcome ~ gender + hypertension + heart_disease + ever_married +
## Residence_type + smoking_status + work_type + s(age) + s(bmi) +
## s(avg_glucose_level)
##
## Estimated degrees of freedom:
## 3.7044 0.0017 0.8089 total = 12.51
##
## UBRE score: -0.6829292
# GAM test-set probabilities, classified at the 0.5 threshold.
gam.pred <- predict(model.gam, newdata = test.data, type = "prob")
gam.prob <- ifelse(gam.pred$Yes > 0.5, "Yes", "No")
confusionMatrix(data = as.factor(gam.prob),
                reference = test.data$stroke,
                positive = "Yes")
## Warning in confusionMatrix.default(data = as.factor(gam.prob), reference =
## test.data$stroke, : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1458 74
## Yes 0 0
##
## Accuracy : 0.9517
## 95% CI : (0.9397, 0.9619)
## No Information Rate : 0.9517
## P-Value [Acc > NIR] : 0.5309
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.9517
## Prevalence : 0.0483
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
# Test-set ROC/AUC for the GAM (column 2 = P(stroke = "Yes")).
roc.gam <- roc(test.data$stroke, gam.pred[, 2])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
auc.gam <- roc.gam$auc[1]
auc.gam
## [1] 0.8431858
plot(roc.gam, legacy.axes = TRUE, print.auc = TRUE)
plot(smooth(roc.gam), col = 4, add = TRUE)
# Pairwise partition plots (klaR::partimat) of the three continuous
# predictors on the training rows, using LDA decision boundaries.
partimat(stroke ~ avg_glucose_level + age + bmi,
data = stroke_df, subset = trRow, method = "lda")
Logistic regression assumes a linear decision boundary (linear in the log-odds), whereas decision trees do not. Decision trees also handle skewed data better.
# Based on CV: compare the resampling distributions of the three models.
res <- resamples(list(GLM = model.glm,
                      GAM = model.gam,
                      KNN = model.knn))
summary(res)
##
## Call:
## summary.resamples(object = res)
##
## Models: GLM, GAM, KNN
## Number of resamples: 10
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GLM 0.7294118 0.8085310 0.8422962 0.8354419 0.8775695 0.8987889 0
## GAM 0.7158497 0.8104623 0.8460400 0.8346551 0.8759454 0.8959150 0
## KNN 0.6936275 0.7876946 0.8112697 0.8020851 0.8266307 0.8522320 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GLM 1 1 1 1 1 1 0
## GAM 1 1 1 1 1 1 0
## KNN 1 1 1 1 1 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GLM 0 0 0 0 0 0 0
## GAM 0 0 0 0 0 0 0
## KNN 0 0 0 0 0 0 0
# Box-and-whisker comparison of the resampled metrics.
# Fix: "AUC" is not a metric produced by twoClassSummary — the resamples
# object carries ROC, Sens and Spec (see the summary above), so the original
# metric = c("ROC", "AUC") requested a nonexistent metric.
bwplot(res, metric = c("ROC", "Sens", "Spec"))
# GLM and GAM perform better compared to KNN. KNN usually requires a larger
# dataset to perform as well as models with parametric structure.
# Overlay the test-set ROC curves of all three models.
# In the type = "prob" output, column 1 is P(No) and column 2 is P(Yes).
glm.pred <- predict(model.glm, newdata = test.data, type = "prob")[, 2]
knn.pred <- predict(model.knn, newdata = test.data, type = "prob")[, 2]
gam.pred <- predict(model.gam, newdata = test.data, type = "prob")[, 2]
roc.glm <- roc(test.data$stroke, glm.pred)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
roc.knn <- roc(test.data$stroke, knn.pred)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
roc.gam <- roc(test.data$stroke, gam.pred)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
auc <- c(roc.glm$auc[1], roc.knn$auc[1], roc.gam$auc[1])
plot(roc.glm, legacy.axes = TRUE)
plot(roc.knn, col = 2, add = TRUE)
plot(roc.gam, col = 3, add = TRUE)
# Legend: model name with its test-set AUC, colored to match the curves.
modelNames <- c("glm", "knn", "gam")
legend("bottomright",
       legend = paste0(modelNames, ": ", round(auc, 3)),
       col = 1:3, lwd = 2)